home *** CD-ROM | disk | FTP | other *** search
- : # Use perl
- eval 'exec perl -S $0 "$@"'
- if 0;
-
- # It's faster to use something like #!/usr/bin/perl but you have to
- # know where perl is on your system. I didn't want to have to tell
- # people how to do this, so I don't distribute it that way. You
- # really ought to change it though. On my machine, it saves about a
- # half a second.
-
- #
- # txt2html.pl
- # Convert raw text to something with a little HTML formatting
- #
- # Written by Seth Golub <seth@cs.wustl.edu>
- # http://www.cs.wustl.edu/~seth/txt2html/
- #
- # $Revision: 1.16 $
- # $Date: 1996/02/06 21:26:11 $
- #
-
-
- #################################################################
- # Some initializations that need to go before the configuration
- #
-
- @links_dictionaries = 0;
-
- #
- #########################
-
-
- #########################
- # Configurable options
- #
-
- # [-s <n> ] | [--shortline <n> ]
- $short_line_length = 40; # Lines this short (or shorter) must be
- # intentionally broken and are kept
- # that short. <BR>
-
- # [-p <n> ] | [--prewhite <n> ]
- $preformat_whitespace_min = 5; # Minimum number of consecutive
- # whitespace characters to trigger
- # preformatting.
- # NOTE: Tabs are now expanded to
- # spaces before this check is made.
- # That means if $tab_width is 8 and
- # this is 5, then one tab may be
- # expanded to 8 spaces, which is
- # enough to trigger preformatting.
-
- $par_indent = 2; # Minumum number of spaces indented in
- # first lines of paragraphs.
- # Only used when there's no blank line
- # preceding the new paragraph. (Like in
- # this comment)
-
- # [-pb <n> ] | [--prebegin <n> ]
- $preformat_trigger_lines = 2; # How many lines of preformatted-looking
- # text are needed to switch to <PRE>
- # <= 0 : Preformat entire document
- # 1 : one line triggers
- # >= 2 : two lines trigger
-
- # [-pe <n> ] | [--preend <n> ]
- $endpreformat_trigger_lines = 2; # How many lines of unpreformatted-looking
- # text are needed to switch from <PRE>
- # <= 0 : Never preformat within document
- # 1 : one line triggers
- # >= 2 : two lines trigger
- # NOTE for --prebegin and --preend:
- # A zero takes precedence. If one is zero, the other is ignored.
- # If both are zero, entire document is preformatted.
-
-
- # [-r <n> ] | [--hrule <n> ]
- $hrule_min = 4; # Min number of ---s for an HRule.
-
- # [-c <n> ] | [--caps <n> ]
- $min_caps_length = 3; # min sequential CAPS for an all-caps line
-
- # [-ct <tag> ] | [--capstag <tag> ]
- $caps_tag = "STRONG"; # Tag to put around all-caps lines
-
- # [-m/+m ] | [--mail / --nomail ]
- $mailmode = 0; # Deal with mail headers & quoted text
-
- # [-u/+u ] | [--unhyphenate / --nounhyphenate ]
- $unhyphenation = 1; # Enables unhyphenation of text.
-
- # [-a <file> ] | [--append <file> ]
- # [+a ] | [--noappend ]
- $append_file = 0; # If you want something appended by
- # default, put the filename here.
- # The appended text will not be
- # processed at all, so make sure it's
- # plain text or decent HTML. i.e. do
- # not have things like:
- # Seth Golub <seth@cs.wustl.edu>
- # but instead, have:
- # Seth Golub <seth@cs.wustl.edu>
-
- # [-t <title>] | [--title <title> ]
- $title = 0; # You can specify a title.
- # Otherwise it won't put one in.
-
- # [-ul <n> ] | [--ulength <n> ]
- $underline_length_tolerance = 1; # How much longer or shorter can
- # underlines be and still be underlines?
-
- # [-uo <n> ] | [--uoffset <n> ]
- $underline_offset_tolerance = 1; # How far offset can underlines
- # be and still be underlines?
-
- # [-tw <n> ] | [--tabwidth <n> ]
- $tab_width = 8; # How many spaces equal a tab?
-
-
- # [-iw <n> ] | [--indent <n> ]
- $indent_width = 2; # Indents this many spaces for each
- # level of a list
-
- # [-/+e ] | [--extract / --noextract ]
- $extract = 0; # Extract Mode (suitable for inserting)
-
- # [-l <file> ] | [--link <dictfile> ]
- # [+l ] | [--nolink ]
- $make_links = 1; # Should we try to link anything?
-
- # [-H <regexp>] | [--heading <regexp> ]
- @custom_heading_regexp = (); # Add a regexp for headings.
- # If it contains at least one
- # subexpression, the first one will be
- # used as a style identifier. A
- # second one, if it exists, will
- # indicate what text should be kept.
- # For example, "^(:+)(.*)" would match
- # any line that started with a colon.
- # The style text would be the colons
- # (however many there were), and the
- # colons would then be removed from
- # the line. The styles are used to
- # determine the heading level. The
- # first style found in the document is
- # level 1, the second level 2, etc.
-
- # Not implemented yet.
- # [-T <t>:<r> ] | [--tag <tagname>:<regexp> ]
- @custom_tags = (); # Similar to --heading, this lets you
- # specify arbitrary patterns to tag.
- # The first subexpression, if one is
- # present, will replace the entire
- # matched text. Example:
- # "em:\*(\w+)\*" will match any word
- # surrounded by asterisks and mark it
- # as emphasized, removing the
- # asterisks.
-
- # [-db <n> ] | [--debug <n> ]
- $dict_debug = 0; # Debug mode for link dictionaries
- # Bitwise-Or what you want to see:
- # 1: The parsing of the dictionary
- # 2: The code that will make the links
-
- $system_link_dict = "/usr/local/lib/txt2html-linkdict"; # after options
- $default_link_dict = "$ENV{'HOME'}/.txt2html-linkdict"; # before options
-
-
- # Uncomment the following lines if you want to force the heading
- # styles to match what Mosaic outputs. (Underlined with "***"s is H1,
- # with "==="s is H2, etc.) This was the behavior of txt2html up to
- # version 1.10.
- #
- #$heading_styles{"*"} = ++$num_heading_styles;
- #$heading_styles{"="} = ++$num_heading_styles;
- #$heading_styles{"+"} = ++$num_heading_styles;
- #$heading_styles{"-"} = ++$num_heading_styles;
- #$heading_styles{"~"} = ++$num_heading_styles;
- #$heading_styles{"."} = ++$num_heading_styles;
-
- # END OF CONFIGURABLE OPTIONS
- ########################################
-
-
- ########################################
- # Definitions (Don't change these)
- #
-
-
- # These are just constants I use for making bit vectors to keep track
- # of what modes I'm in and what actions I've taken on the current and
- # previous lines.
-
- $NONE = 0;
- $LIST = 1;
- $HRULE = 2;
- $PAR = 4;
- $PRE = 8;
- $END = 16;
- $BREAK = 32;
- $HEADER = 64;
- $MAILHEADER = 128;
- $MAILQUOTE = 256;
- $CAPS = 512;
- $LINK =1024;
-
-
- # Constants for Ordered Lists and Unordered Lists.
- # I use this in the list stack to keep track of what's what.
-
- $OL = 1;
- $UL = 2;
-
-
- ########################################
- ########################################
- #
- # Subroutine definitions
-
- sub usage
- {
- $0 =~ s#.*/##;
- print STDERR <<EOUsage;
-
- Usage: $0 [options]
-
- where options are:
- [-v ] | [--version ]
- [-h ] | [--help ]
- [-t <title> ] | [--title <title> ]
- [-l <file> ] | [--link <dictfile> ]
- [+l ] | [--nolink ]
- [-H <regexp>] | [--heading <regexp> ]
- [-a <file> ] | [--append <file> ]
- [+a ] | [--noappend ]
- [-e/+e ] | [--extract / --noextract ]
- [-c <n> ] | [--caps <n> ]
- [-ct <tag> ] | [--capstag <tag> ]
- [-m/+m ] | [--mail / --nomail ]
- [-u/+u ] | [--unhyphen / --nounhyphen ]
- [-tw <n> ] | [--tabwidth <n> ]
- [-iw <n> ] | [--indent <n> ]
- [-ul <n> ] | [--ulength <n> ]
- [-uo <n> ] | [--uoffset <n> ]
- [-s <n> ] | [--shortline <n> ]
- [-p <n> ] | [--prewhite <n> ]
- [-pb <n> ] | [--prebegin <n> ]
- [-pe <n> ] | [--preend <n> ]
- [-r <n> ] | [--hrule <n> ]
- [-db <n> ] | [--debug <n> ]
-
- More complete explanations of these options can be found in
- comments near the beginning of the script.
-
- EOUsage
- # [-T <t>:<r> ] | [--tag <tagname>:<regexp> ]
- }
-
-
- sub deal_with_options
- {
- while ($ARGV[0] =~ /^[-+].+/)
- {
- if (($ARGV[0] eq "-l" || $ARGV[0] eq "--link") &&
- $ARGV[1])
- {
- if (-r $ARGV[1]) {
- $make_links = 1;
- # Stick it on the end of the list
- push(@links_dictionaries, $ARGV[1]);
- } else {
- print STDERR "Can't find or read link-file $ARGV[1].\n";
- }
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "+l" || $ARGV[0] eq "--nolink") )
- {
- $system_link_dict = "";
- $make_links = 0;
- @links_dictionaries = 0;
- next;
- }
-
- if (($ARGV[0] eq "-H" || $ARGV[0] eq "--heading") &&
- $ARGV[1])
- {
- push(@custom_heading_regexp, $ARGV[1]);
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-T" || $ARGV[0] eq "--tag") &&
- $ARGV[1])
- {
- print STDERR "Sorry. $ARGV[0] isn't supported yet.\n";
- push(@custom_tags, $ARGV[1]);
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-r" || $ARGV[0] eq "--hrule") &&
- $ARGV[1] =~ /^\d+$/)
- {
- $hrule_min = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-s" || $ARGV[0] eq "--shortline") &&
- $ARGV[1] =~ /^\d+$/)
- {
- $short_line_length = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-p" || $ARGV[0] eq "--prewhite") &&
- $ARGV[1] =~ /^\d+$/)
- {
- $preformat_whitespace_min = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-pb" || $ARGV[0] eq "--prebegin") &&
- $ARGV[1] =~ /^-?\d+$/)
- {
- $preformat_trigger_lines = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-pe" || $ARGV[0] eq "--preend") &&
- $ARGV[1] =~ /^-?\d+$/)
- {
- $endpreformat_trigger_lines = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-e" || $ARGV[0] eq "--extract"))
- {
- $extract = 1;
- next;
- }
-
- if (($ARGV[0] eq "+e" || $ARGV[0] eq "--noextract"))
- {
- $extract = 0;
- next;
- }
-
- if (($ARGV[0] eq "-c" || $ARGV[0] eq "--caps") &&
- $ARGV[1] =~ /^\d+$/)
- {
- $min_caps_length = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-ct" || $ARGV[0] eq "--capstag") &&
- $ARGV[1])
- {
- $caps_tag = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if ($ARGV[0] eq "-m" || $ARGV[0] eq "--mail")
- {
- $mailmode = 1;
- next;
- }
-
- if ($ARGV[0] eq "+m" || $ARGV[0] eq "--nomail")
- {
- $mailmode = 0;
- next;
- }
-
- if ($ARGV[0] eq "-u" || $ARGV[0] eq "--unhyphen")
- {
- $unhyphenation = 1;
- next;
- }
-
- if ($ARGV[0] eq "+u" || $ARGV[0] eq "--nounhyphen")
- {
- $unhyphenation = 0;
- next;
- }
-
- if (($ARGV[0] eq "-a" || $ARGV[0] eq "--append") &&
- $ARGV[1])
- {
- if (-r $ARGV[1]) {
- $append_file = $ARGV[1];
- } else {
- print STDERR "Can't find or read $ARGV[1].\n";
- }
- shift @ARGV;
- next;
- }
-
- if ($ARGV[0] eq "+a" || $ARGV[0] eq "--noappend")
- {
- $append_file = 0;
- next;
- }
-
- if (($ARGV[0] eq "-t" || $ARGV[0] eq "--title") &&
- $ARGV[1])
- {
- $title = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-ul" || $ARGV[0] eq "--ulength") &&
- $ARGV[1] =~ /^\d+$/)
- {
- $underline_length_tolerance = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-uo" || $ARGV[0] eq "--uoffset") &&
- $ARGV[1] =~ /^\d+$/)
- {
- $underline_offset_tolerance = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-tw" || $ARGV[0] eq "--tabwidth") &&
- $ARGV[1] =~ /^\d+$/)
- {
- $tab_width = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if (($ARGV[0] eq "-iw" || $ARGV[0] eq "--indentwidth") &&
- $ARGV[1] =~ /^\d+$/)
- {
- $indent_width = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if ($ARGV[0] eq "-v" || $ARGV[0] eq "--version")
- {
- print "txt2html " . '$Revision: 1.16 $ ' . "\n";
- exit;
- }
-
- if ($ARGV[0] eq "-h" || $ARGV[0] eq "--help")
- {
- &usage;
- exit;
- }
-
- if (($ARGV[0] eq "-db" || $ARGV[0] eq "--debug") &&
- $ARGV[1] =~ /^\d+$/)
- {
- $dict_debug = $ARGV[1];
- shift @ARGV;
- next;
- }
-
- if ($ARGV[0] eq "--")
- {
- last;
- }
-
- print STDERR "Unrecognized option: $ARGV[0]\n";
- print STDERR " or bad paramater: $ARGV[1]\n" if($ARGV[1]);
-
- &usage;
- exit(1);
-
- } continue {
-
- shift @ARGV;
- }
-
- $preformat_trigger_lines = 0 if ($preformat_trigger_lines < 0);
- $preformat_trigger_lines = 2 if ($preformat_trigger_lines > 2);
-
- $endpreformat_trigger_lines = 1 if ($preformat_trigger_lines == 0);
- $endpreformat_trigger_lines = 0 if ($endpreformat_trigger_lines < 0);
- $endpreformat_trigger_lines = 2 if ($endpreformat_trigger_lines > 2);
- }
-
- sub is_blank
- {
- return $_[0] =~ /^\s*$/;
- }
-
- sub escape
- {
- $line =~ s/&/&/g;
- $line =~ s/>/>/g;
- $line =~ s/</</g;
- }
-
- sub hrule
- {
- if ($line =~ /^\s*([-_~=\*]\s*){$hrule_min,}$/)
- {
- $line = "<HR>\n";
- $prev =~ s/<P>//;
- $line_action |= $HRULE;
- } elsif ($line =~ /\014/)
- {
- $line_action |= $HRULE;
- $line =~ s/\014/\n<HR>\n/g; # Linefeeds become horizontal rules
- }
- }
-
- sub shortline
- {
-
- # Short lines should be broken even on list item lines iff the
- # following line is more text. I haven't figured out how to do
- # that yet. For now, I'll just not break on short lines in lists.
- # (sorry)
-
- if (!($mode & ($PRE | $LIST))
- && !&is_blank($line)
- && !&is_blank($prev)
- && ($prev_line_length < $short_line_length)
- && !($line_action & ($END | $HEADER | $HRULE | $LIST | $PAR))
- && !($prev_action & ($HEADER | $HRULE | $BREAK)))
- {
- $prev .= "<BR>" . chop($prev);
- $prev_action |= $BREAK;
- }
- }
-
- sub mailstuff
- {
- if ((($line =~ /^\w*>/) # Handle "FF> Werewolves."
- || ($line =~ /^\w*\|/)) # Handle "Igor| There wolves."
- && !&is_blank($nextline))
- {
- $line =~ s/$/<BR>/;
- $line_action |= ($BREAK | $MAILQUOTE);
- if(!($prev_action & ($BREAK | $PAR)))
- {
- $prev .= "<P>\n";
- $line_action |= $PAR;
- }
- } elsif (($line =~ /^(From:?)|(Newsgroups:) /)
- && &is_blank($prev))
- {
- &anchor_mail if !($prev_action & $MAILHEADER);
- chop $line;
- $line = "<!-- New Message -->\n<p>\n" . $line . "<BR>\n";
- $line_action |= ($BREAK | $MAILHEADER | $PAR);
- } elsif (($line =~ /^[\w\-]*:/) # Handle "Some-Header: blah"
- && ($prev_action & $MAILHEADER)
- && !&is_blank($nextline))
- {
- $line =~ s/$/<BR>/;
- $line_action |= ($BREAK | $MAILHEADER);
- } elsif (($line =~ /^\s+\S/) && # Handle multi-line mail headers
- ($prev_action & $MAILHEADER) &&
- !&is_blank($nextline))
- {
- $line =~ s/$/<BR>/;
- $line_action |= ($BREAK | $MAILHEADER);
- }
- }
-
- # Subtracts modes listed in $mask from $vector.
- sub subtract_modes
- {
- local($vector, $mask) = @_;
- ($vector | $mask) - $mask;
- }
-
- sub paragraph
- {
- if(!&is_blank($line)
- && !($mode & $PRE)
- && !&subtract_modes($line_action, $END | $MAILQUOTE | $CAPS | $BREAK)
- && (&is_blank($prev)
- || ($line_action & $END)
- || ($line_indent > $prev_indent + $par_indent)))
- {
- $prev .= "<P>\n";
- $line_action |= $PAR;
- }
- }
-
- # If the line is blank, return the second argument. Otherwise,
- # return the number of spaces before any nonspaces on the line.
- sub count_indent
- {
- local($line, $prev_length) = @_;
- if(&is_blank($line))
- {
- return $prev_length;
- }
- local($ws) = $line =~ /^( *)[^ ]/;
- length($ws);
- }
-
- sub listprefix
- {
- local($line) = @_;
- local($prefix, $number, $rawprefix);
-
- return (0,0,0) if (!($line =~ /^\s*[-=\*o]+\s+\S/ ) &&
- !($line =~ /^\s*(\d+|[a-zA-Z])[\.\)\]:]\s+\S/ ));
-
- ($number) = $line =~ /^\s*(\d+|[a-zA-Z])/;
-
- # That slippery exception of "o" as a bullet
- # (This ought to be determined using the context of what lists
- # we have in progress, but this will probably work well enough.)
- if($line =~ /^\s*o\s/)
- {
- $number = 0;
- }
-
- if ($number)
- {
- ($rawprefix) = $line =~ /^(\s*(\d+|[a-zA-Z]).)/;
- $prefix = $rawprefix;
- $prefix =~ s/(\d+|[a-zA-Z])//; # Take the number out
- } else {
- ($rawprefix) = $line =~ /^(\s*[-=o\*]+.)/;
- $prefix = $rawprefix;
- }
- ($prefix, $number, $rawprefix);
- }
-
- sub startlist
- {
- local($prefix, $number, $rawprefix) = @_;
-
- $listprefix[$listnum] = $prefix;
- if($number)
- {
- # It doesn't start with 1,a,A. Let's not screw with it.
- if (($number != 1) && ($number ne "a") && ($number ne "A"))
- {
- return 0;
- }
- $prev .= "$list_indent<OL>\n";
- $list[$listnum] = $OL;
- } else {
- $prev .= "$list_indent<UL>\n";
- $list[$listnum] = $UL;
- }
-
- $listnum++;
- $list_indent = " " x $listnum x $indent_width;
- $line_action |= $LIST;
- $mode |= $LIST;
- 1;
- }
-
-
- sub endlist # End N lists
- {
- local($n) = @_;
- for(; $n > 0; $n--, $listnum--)
- {
- $list_indent = " " x ($listnum-1) x $indent_width;
- if($list[$listnum-1] == $UL)
- {
- $prev .= "$list_indent</UL>\n";
- } elsif($list[$listnum-1] == $OL)
- {
- $prev .= "$list_indent</OL>\n";
- } else
- {
- print STDERR "Encountered list of unknown type\n";
- }
- }
- $line_action |= $END;
- $mode ^= ($LIST & $mode) if (!$listnum);
- }
-
- sub continuelist
- {
- $line =~ s/^\s*[-=o\*]+\s*/$list_indent<LI> / if $list[$listnum-1] == $UL;
- $line =~ s/^\s*(\d+|[a-zA-Z]).\s*/$list_indent<LI> /
- if $list[$listnum-1] == $OL;
- $line_action |= $LIST;
- }
-
- sub liststuff
- {
- local($i);
-
- local($prefix, $number, $rawprefix) = &listprefix($line);
-
- $i = $listnum;
- if (!$prefix)
- {
- return if !&is_blank($prev); # inside a list item
-
- # This ain't no list. We'll want to end all of them.
- return if !($mode & $LIST); # This just speeds up the inevitable
- $i = 0;
- } else
- {
- # Maybe we're going back up to a previous list
- $i-- while (($prefix ne $listprefix[$i-1]) && ($i >= 0));
- }
-
- local($islist);
-
- # Measure the indent from where the text starts, not where the
- # prefix starts. This won't screw anything up, and if we don't do
- # it, the next line might appear to be indented relative to this
- # line, and get tagged as a new paragraph.
- local($total_prefix) = $line =~ /^(\s*[\da-zA-Z-=o\*]+.\s*)/;
- # Of course, we only use it if it really turns out to be a list.
-
- $islist = 1;
- if (($i >= 0) && ($i != $listnum))
- {
- &endlist($listnum - $i);
- $islist = 0;
- } elsif (!$listnum || $i != $listnum)
- {
- if ( ($line_indent > 0) || &is_blank($prev) )
- {
- $islist = &startlist($prefix, $number, $rawprefix);
- } else
- {
- # We have something like this: "- foo" which usually
- # turns out not to be a list.
- return;
- }
- }
-
- &continuelist($prefix, $number, $rawprefix) if ($mode & $LIST);
- $line_indent = length($total_prefix) if $islist;
- }
-
- # Returns true if the passed string is considered to be preformatted
- sub is_preformatted
- {
- ((@_[0] =~ /\s{$preformat_whitespace_min,}\S+/o) # whitespaces
- || (@_[0] =~ /\.{$preformat_whitespace_min,}\S+/o)); # dots
- }
-
- sub endpreformat
- {
- if(!&is_preformatted($line)
- && ($endpreformat_trigger_lines == 1
- || !&is_preformatted($nextline)))
- {
- $prev .= "</PRE>\n";
- $mode ^= ($PRE & $mode);
- $line_action |= $END;
- }
- }
-
- sub preformat
- {
- if($preformat_trigger_lines == 0
- || (&is_preformatted($line) &&
- ($preformat_trigger_lines == 1 || &is_preformatted($nextline))))
- {
- $line =~ s/^/<PRE>\n/;
- $prev =~ s/<P>//;
- $mode |= $PRE;
- $line_action |= $PRE;
- }
- }
-
- sub make_new_anchor
- {
- local($anchor, $i);
-
- return sprintf("%d", $non_header_anchor++) if(!$heading_level);
- $anchor = "section-";
- $heading_count[$heading_level-1]++;
-
- # Reset lower order counters
- for($i=$#heading_count + 1; $i > $heading_level; $i--)
- {
- $heading_count[$i-1] = 0;
- }
-
- for($i=0; $i < $heading_level; $i++)
- {
- $heading_count[$i] = 1 if !$heading_count[$i]; # In case they skip any
- $anchor .= sprintf("%d.", $heading_count[$i]);
- }
- chop($anchor);
- $anchor;
- }
-
- sub anchor_mail
- {
- local($text) = $line =~ /\S+: *(.*) *$/;
- local($anchor) = &make_new_anchor($text);
- $line =~ s/(.*)/<A NAME="$anchor">$1<\/A>/;
- }
-
- sub anchor_heading
- {
- local($heading) = @_;
- local($anchor) = &make_new_anchor($heading);
- $line =~ s/(<H.>.*<\/H.>)/<A NAME="$anchor">$1<\/A>/;
- }
-
- sub heading_level
- {
- local($style) = @_;
- $heading_styles{$style} = ++$num_heading_styles
- if !$heading_styles{$style};
- $heading_styles{$style};
- }
-
- sub heading
- {
- local($hoffset, $heading) = $line =~ /^(\s*)(.+)$/;
- local($uoffset, $underline) = $nextline =~ /^(\s*)(\S+)\s*$/;
-
- local($lendiff, $offsetdiff);
- $lendiff = length($heading) - length($underline);
- $lendiff *= -1 if $lendiff < 0;
-
- $offsetdiff = length($hoffset) - length($uoffset);
- $offsetdiff *= -1 if $offsetdiff < 0;
-
- if(&is_blank($line)
- ||($lendiff > $underline_length_tolerance)
- ||($offsetdiff > $underline_offset_tolerance))
- {
- return;
- }
-
- $underline = substr($underline,0,1);
-
- $underline .= "C" if &iscaps($line); # Call it a different style if the
- # heading is in all caps.
- $nextline = &getline; # Eat the underline
- $heading_level = &heading_level($underline);
- &tagline("H" . $heading_level);
- &anchor_heading($heading);
- $line_action |= $HEADER;
- }
-
- sub custom_heading
- {
- local($i);
- for($i=0; $i <= $#custom_heading_regexp; $i++)
- {
- if ($line =~ /$custom_heading_regexp[$i]/)
- {
- ($style, $rest) = ($line =~ /$custom_heading_regexp[$i]/);
- $line = ($rest . "\n") if $rest;
- $style || ($style = $i);
- &tagline("H" . &heading_level("Cust" . $style));
- &anchor_heading($rest);
- $line_action |= $HEADER;
- last;
- }
- }
- }
-
- sub unhyphenate
- {
- local($second);
-
- # This looks hairy because of all the quoted characters.
- # All I'm doing is pulling out the word that begins the next line.
- # Along with it, I pull out any punctuation that follows.
- # Preceding whitespace is preserved. We don't want to screw up
- # our own guessing systems that rely on indentation.
- ($second) = $nextline =~ /^\s*([a-zA-Z]+[\)\}\]\.,:;\'\"\>]*\s*)/; # "
- $nextline =~ s/^(\s*)[a-zA-Z]+[\)\}\]\.,:;\'\"\>]*\s*/$1/; # "
- # (The silly comments are for my less-than-perfect code hilighter)
-
- $line =~ s/\-\s*$/$second/;
- $line .= "\n";
- }
-
- sub untabify
- {
- local($line) = @_;
- while($line =~ /\011/)
- {
- $line =~ s/\011/" " x ($tab_width - (length($`) % $tab_width))/e;
- }
- $line;
- }
-
- sub tagline
- {
- local($tag) = @_;
- chop $line; # Drop newline
- $line =~ s/^\s*(.*)$/<$tag>$1<\/$tag>\n/;
- }
-
- sub iscaps
- {
- local($_) = @_;
- /^[^a-z<]*[A-Z]{$min_caps_length,}[^a-z<]*$/;
- }
-
- sub caps
- {
- if(&iscaps($line))
- {
- &tagline($caps_tag);
- $line_action |= $CAPS;
- }
- }
-
- # Convert very simple globs to regexps
- sub glob2regexp
- {
- local($glob) = @_;
- # Escape funky chars
- $glob =~ s/[^\w\[\]\*\?\|\\]/\\$&/g;
- local($regexp,$i,$len,$escaped) = ("",0,length($glob),0);
-
- for(;$i < $len; $i++)
- {
- $char = substr($glob,$i,1);
- if($escaped)
- {
- $escaped = 0;
- $regexp .= $char;
- next;
- }
- if ($char eq "\\") {
- $escaped = 1; next;
- $regexp .= $char;
- }
- if ($char eq "?") {
- $regexp .= "."; next;
- }
- if ($char eq "*") {
- $regexp .= ".*"; next;
- }
- $regexp .= $char; # Normal character
- }
- "\\b" . $regexp . "\\b";
- }
-
- sub add_regexp_to_links_table
- {
- local($key,$URL,$switches) = @_;
- # No sense adding a second one if it's already in there.
- # It would never get used.
- if(!$links_table{$key})
- {
- # Keep track of the order they were added so we can
- # look for matches in the same order
- push(@links_table_order, ($key));
-
- $links_table{$key} = $URL; # Put it in The Table
- $links_switch_table{$key} = $switches;
- print STDERR
- " ($#links_table_order)\tKEY: $key\n\tVALUE: $URL\n\tSWITCHES: $switches\n\n"
- if ($dict_debug & 1);
- } else
- {
- if($dict_debug & 1) {
- print STDERR " Skipping entry. Key already in table.\n";
- print STDERR "\tKEY: $key\n\tVALUE: $URL\n\n";
- }
- }
- }
-
- sub add_literal_to_links_table
- {
- local($key,$URL,$switches) = @_;
- $key =~ s/(\W)/\\\1/g; # Escape non-alphanumeric chars
- $key = "\\b$key\\b"; # Make a regexp out of it
- &add_regexp_to_links_table($key,$URL,$switches);
- }
-
- sub add_glob_to_links_table
- {
- local($key,$URL,$switches) = @_;
- &add_regexp_to_links_table(&glob2regexp($key),$URL,$switches);
- }
-
- # This is the only function you should need to change if you want to
- # use a different dictionary file format.
- sub parse_dict
- {
- local($dictfile, $dict) = @_;
-
- print STDERR "Parsing dictionary file $dictfile\n" if ($dict_debug & 1);
-
- $dict =~ s/^\#.*$//g; # Strip lines that start with '#'
- $dict =~ s/^.*[^\\]:\s*$//g; # Strip lines that end with unescaped ':'
-
- if($dict =~ /->\s*->/)
- {
- $message = "Two consecutive '->'s found in $dictfile\n";
-
- # Print out any useful context so they can find it.
- ($near) = $dict =~ /([\S ]*\s*->\s*->\s*\S*)/;
- $message .= "\n$near\n" if $near =~ /\S/;
- die $message;
- }
-
- while($dict =~ /\s*(.+)\s+\-+([ieh]+\-+)?\>\s*(.*\S+)\s*\n/ig)
- {
- local($key, $URL,$switches,$options);
- $key = $1;
- $options = $2;
- $URL = $3;
- $switches = 0;
- $switches += 1 if $options =~ /i/i; # Case insensitivity
- $switches += 2 if $options =~ /e/i; # This could be interesting
- $switches += 4 if $options =~ /h/i; # provides HTML, not just URL
-
- $key =~ s/\s*$//; # Chop trailing whitespace
-
- if($key =~ m|^/|) # Regexp
- {
- $key = substr($key,1);
- $key =~ s|/$||; # Allow them to forget the closing /
- &add_regexp_to_links_table($key,$URL,$switches);
- } elsif($key =~ /^\|/) # alternate regexp format
- {
- $key = substr($key,1);
- $key =~ s/\|$//; # Allow them to forget the closing |
- $key =~ s|/|\\/|g; # Escape all slashes
- &add_regexp_to_links_table($key,$URL,$switches);
- } elsif ($key =~ /\"/)
- {
- $key = substr($key,1);
- $key =~ s/\"$//; # Allow them to forget the closing "
- &add_literal_to_links_table($key,$URL,$switches);
- } else
- {
- &add_glob_to_links_table($key,$URL,$switches);
- }
- }
- }
-
- sub in_link_context
- {
- local($match, $before) = @_;
- return 1 if $match =~ m@</?A>@i; # No links allowed inside match
-
- local($final_open, $final_close);
- $final_open = rindex($before, "<A ") - $[;
- $final_close = rindex($before, "</A>") - $[;
-
- ($final_open >= 0) # Link opened
- && (($final_close < 0) # and not closed or
- || ($final_open > $final_close)); # one opened after last close
- }
-
- # This subroutine looks a little odd. Rather than build up some code
- # and keep "eval"ing later, I'm building a new subroutine. This way I
- # can declare local vars and not worry about the namespace in the
- # calling context. I don't know how much it really gains me, but I
- # don't know of any real costs and it seems like it could be
- # friendlier to optimization. (Plus it's cool to define new
- # subroutines at runtime. :-)
- sub make_dictionary_links_code
- {
- local($i,$pattern,$switches,$options,$code,$href);
- $code = <<EOCode;
- sub dynamic_make_dictionary_links
- {
- local(\$line_link) = (\$line_action | \$LINK);
- local(\$before,\$linkme,\$line_with_links);
- EOCode
- for($i=1; $i <= $#links_table_order; $i++)
- {
- $pattern = $links_table_order[$i];
- $key = $pattern;
- $switches = $links_switch_table{$key};
-
- $s_sw = ""; # Options for searching
- $s_sw .= "i" if($switches & 1);
-
- $r_sw = ""; # Options for replacing
- $r_sw .= "i" if($switches & 1);
- $r_sw .= "e" if($switches & 2);
-
- $href = $links_table{$key};
-
- $href =~ s@/@\\/@g;
- $href = '<A HREF="' . $href . '">$&<\\/A>'
- if !($switches & 4);
-
- $code .= <<EOCode;
-
- \$line_with_links = "";
- while(\$line =~ /$pattern/$s_sw)
- {
- \$link_line = $LINK if(!\$link_line);
- \$before = \$\`;
- \$linkme = \$&;
-
- \$line = substr(\$line, length(\$before) + length(\$linkme));
- \$linkme =~ s/$pattern/$href/$r_sw
- if(!&in_link_context(\$linkme,\$line_with_links . \$before));
- \$line_with_links .= \$before . \$linkme;
- }
- \$line = \$line_with_links . \$line;
- EOCode
- }
- $code .= <<EOCode;
-
- \$line_action |= \$line_link; # Cheaper to only to do bitwise OR once.
- }
- EOCode
- print STDERR "$code" if ($dict_debug & 2);
- eval "$code";
- if($@)
- {
- print STDERR "Problem making dictionary eval code\n";
- die $@;
- }
- }
-
- sub load_dictionary_links
- {
- local($dict, $contents);
- local($i);
- @links_table_order = 0;
- %links_table = 0;
- for($i=1; $i <= $#links_dictionaries; $i++)
- {
- $dict= $links_dictionaries[$i];
-
- (-r "$dict") || die "Dictionary file $dict not found or unreadable";
- open(DICT, "$dict") || die "Can't open Dictionary file $dict";
-
- $contents = "";
- $contents .= $_ while(<DICT>);
- close(DICT);
- &parse_dict($dict, $contents);
- }
- &make_dictionary_links_code;
- }
-
- sub make_dictionary_links
- {
- eval "&dynamic_make_dictionary_links;";
- warn $@ if $@;
- }
-
- sub getline
- {
- local($line);
- $line = <>;
- $line =~ s/[ \011]*\015$//; # Chop trailing whitespace and DOS CRs
- $line = &untabify($line); # Change all tabs to spaces
- $line;
- }
-
- sub main
- {
- $* = 1; # Turn on multiline searches
- push(@links_dictionaries,($default_link_dict))
- if ($make_links && (-f $default_link_dict));
- &deal_with_options;
- if($make_links)
- {
- push(@links_dictionaries,($system_link_dict)) if -f $system_link_dict;
- &load_dictionary_links;
- }
- if(!$extract)
- {
- print "<HTML>\n";
- print "<HEAD>\n";
-
- # It'd be nice if we could guess a title from the first header,
- # but even that would be too late if we're doing this in one pass.
- print "<TITLE>$title</TITLE>\n" if($title);
-
- print "</HEAD>\n";
-
- print "<BODY>\n";
- }
-
- $prev_line_length = 0;
- $prev_indent = 0;
- $prev = "";
- $line = &getline;
- $nextline = &getline if $line;
- $line = &untabify($line);
- do
- {
- $line_length = length($line); # Do this before tags go in
- $line_indent = &count_indent($line, $prev_indent);
-
- &escape;
-
- &endpreformat if (($mode & $PRE) && ($preformat_trigger_lines != 0));
-
- &hrule if !($mode & $PRE);
-
- &liststuff if (!($mode & $PRE) &&
- !&is_blank($line));
-
- &custom_heading if (($#custom_heading_regexp > -1)
- && !($mode & $PRE));
-
- &heading if (!($mode & ($PRE | $HEADER)) &&
- $nextline =~ /^\s*[=\-\*\.~\+]+\s*$/);
-
- # &custom_tag if (($#custom_tags > -1)
- # && !($mode & $PRE)
- # && !($line_action & $HEADER));
-
- &mailstuff if ($mailmode &&
- !($mode & $PRE) &&
- !($line_action & $HEADER));
-
- &preformat if (!($line_action & ($HEADER | $LIST | $MAILHEADER)) &&
- !($mode & ($LIST | $PRE)) &&
- ($endpreformat_trigger_lines != 0));
-
- ¶graph;
- &shortline;
-
- &unhyphenate if ($unhyphenation &&
- ($line =~ /[a-zA-Z]\-$/) && # ends in hyphen
- # next line starts w/letters
- ($nextline =~ /^\s*[a-zA-Z]/) &&
- !($mode & ($PRE | $HEADER | $MAILHEADER | $BREAK)));
-
- &make_dictionary_links if ($make_links
- && !&is_blank($line)
- && $#links_table_order);
-
- &caps if !($mode & $PRE);
-
- # Print it out and move on.
-
- print $prev;
-
- if (!&is_blank($nextline))
- {
- $prev_action = $line_action;
- $line_action = $NONE;
- $prev_line_length = $line_length;
- $prev_indent = $line_indent;
- }
-
- $prev = $line;
- $line = $nextline;
- $nextline = &getline if $nextline;
- } until (!$nextline && !$line && !$prev);
-
- $prev = "";
- &endlist($listnum) if ($mode & $LIST); # End all lists
- print $prev;
-
- print "\n";
-
- print "</PRE>\n" if ($mode & $PRE);
-
- if ($append_file)
- {
- if(-r $append_file)
- {
- open(APPEND, $append_file);
- print while <APPEND>;
- } else {
- print STDERR "Can't find or read file $append_file to append.\n";
- }
- }
-
- if(!$extract)
- {
- print "</BODY>\n";
- print "</HTML>\n";
- }
- }
-
- &main();
-
-